Problem Statement:
Data Description:
Evaluation Metric:
# to deal with dataframes
import numpy as np
import pandas as pd
# Notebook display settings: show every column, never wrap a wide frame onto
# several lines, and never truncate long cell text (the tweets).
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# `None` means "no limit". The legacy value -1 was deprecated in pandas 1.0
# and is rejected by current releases, which only accept None or an int >= 0.
pd.set_option('display.max_colwidth', None)
# for visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')
# for text preprocessing
import re
from wordcloud import WordCloud
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
import unicodedata
from textblob import TextBlob
import emoji
# for machine learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.classifier import StackingClassifier
# factors dataset: read the train and test tables, in that order
train_1, test_1 = (
    pd.read_csv(csv_path)
    for csv_path in ('train_factors.csv', 'test_factors.csv')
)
Converting the .json file into a .csv file.
# train set
#json_train=pd.read_json("train_data.json")
#a=[]
#b=[]
#c=[]
#d=[]
#for i in json_train.records:
# d.append(i['sentiment_score'])
# c.append(i['stocktwit_tweet'])
# b.append(pd.to_datetime(i['timestamp']))
# a.append(i['ticker'])
#train_2=pd.DataFrame({'ticker':a,'timestamp':b,'tweet':c,'sentiment':d})
#train_2.to_csv('train_2.csv')
# test set
#json_test=pd.read_json("test_data.json")
#a=[]
#b=[]
#c=[]
#for i in json_test.records:
# c.append(i['stocktwit_tweet'])
# b.append(pd.to_datetime(i['timestamp']))
# a.append(i['ticker'])
#test_2=pd.DataFrame({'ticker':a,'timestamp':b,'tweet':c})
#test_2.to_csv('test_2.csv')
# tweet dataset: read the CSV copies of the train and test tweets, in that order
train_2, test_2 = (
    pd.read_csv(csv_path)
    for csv_path in ('train_2.csv', 'test_2.csv')
)
train set
# first rows of the factor and tweet training tables
train_1.head()
train_2.head()
# column dtypes and non-null counts of both training tables
train_1.info()
train_2.info()
There are no missing values in either train set.
# summary statistics for every column (numeric and non-numeric) of the training tables
train_1.describe(include='all')
train_2.describe(include='all')
test set
# first rows of the factor and tweet test tables
test_1.head()
test_2.head()
# column dtypes and non-null counts of both test tables
test_1.info()
test_2.info()
There are no missing values in either test set.
# summary statistics for every column (numeric and non-numeric) of the test tables
test_1.describe(include='all')
test_2.describe(include='all')
# Normalise the factor-table dates: swap "/" for "-" and parse to datetime.
# NOTE(review): no explicit format/dayfirst is passed, so pandas guesses the
# field order -- confirm the CSV dates are not day-first.
for _factors in (train_1, test_1):
    _dashed_dates = _factors['date'].str.replace("/", "-")
    _factors['date'] = pd.to_datetime(_dashed_dates)
# The first CSV column has no header (presumably the index written by
# to_csv -- confirm); expose it as "Id".
train_2 = train_2.rename(columns={"Unnamed: 0": "Id"})
test_2 = test_2.rename(columns={"Unnamed: 0": "Id"})
# changing the datatype
train_2['timestamp'] = pd.to_datetime(train_2['timestamp'])
test_2['timestamp'] = pd.to_datetime(test_2['timestamp'])
# Extracting the calendar date into a new datetime column.
# The previous round trip through '%d/%m/%y' strings was ambiguous: when the
# strings are parsed back without a format, "05/07/18" is read month-first
# while "25/07/18" can only be day-first, so days and months were swapped for
# part of the rows (or parsing failed outright on newer pandas).  An ISO
# string with an explicit format is unambiguous and still yields a naive
# (timezone-free) midnight timestamp.
_DATE_FORMAT = '%Y-%m-%d'
train_2['date'] = pd.to_datetime(
    train_2['timestamp'].dt.strftime(_DATE_FORMAT), format=_DATE_FORMAT
)
test_2['date'] = pd.to_datetime(
    test_2['timestamp'].dt.strftime(_DATE_FORMAT), format=_DATE_FORMAT
)
# removing timestamp column
train_2 = train_2.drop(columns=['timestamp'])
test_2 = test_2.drop(columns=['timestamp'])
# Remove URLs (anything starting with http... or www..., case-insensitive).
# `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern
# as a literal string by default, which would silently leave every URL in.
# The pattern is a raw string so `\S` is not an invalid string escape.
_URL_PATTERN = r'http\S*|www\S*'
train_2['tweet'] = train_2['tweet'].str.replace(_URL_PATTERN, "", case=False, regex=True)
test_2['tweet'] = test_2['tweet'].str.replace(_URL_PATTERN, "", case=False, regex=True)
# Compiled once at import time instead of on every call; `remove_emoji` is
# applied to each tweet individually.
# NOTE: the last range (U+24C2..U+1F251) is very wide and also covers
# non-emoji code points such as CJK ideographs; kept as in the original.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(tweet):
    """Return *tweet* with every run of characters in the emoji ranges removed.

    Parameters:
        tweet: the text to clean (a ``str``).

    Returns:
        The text without the matched characters; surrounding whitespace is
        left untouched.
    """
    return _EMOJI_PATTERN.sub('', tweet)
# strip emoji from every train and test tweet
train_2['tweet']=train_2['tweet'].apply(remove_emoji)
test_2['tweet']=test_2['tweet'].apply(remove_emoji)
def remove_numbers(tweet):
    """Return *tweet* with every digit character dropped."""
    return ''.join(char for char in tweet if not char.isdigit())
# drop digit characters from every train and test tweet
train_2['tweet']=train_2['tweet'].apply(remove_numbers)
test_2['tweet']=test_2['tweet'].apply(remove_numbers)
def _drop_cashtags(tweet):
    """Return *tweet* without the whitespace-separated tokens that start with '$'.

    The remaining tokens are re-joined with single spaces.
    """
    return " ".join(token for token in tweet.split() if not token.startswith('$'))


# Iterate over the column values directly instead of `series[i]` for
# i in range(n): that lookup is label based and only works while the index is
# exactly 0..n-1.  `y_train` / `y_test` are kept as plain lists, as before.
y_train = [_drop_cashtags(tweet) for tweet in train_2['tweet']]
train_2['tweet'] = y_train
y_test = [_drop_cashtags(tweet) for tweet in test_2['tweet']]
test_2['tweet'] = y_test
# `regex=True` is passed explicitly on every call below: since pandas 2.0
# `str.replace` defaults to a literal match, which would leave these steps
# silently doing nothing.
# remove @mentions
train_2['tweet'] = train_2['tweet'].str.replace(r"@[\w]*", "", regex=True)
test_2['tweet'] = test_2['tweet'].str.replace(r"@[\w]*", "", regex=True)
# replace every character that is not an ASCII letter with a space
# NOTE(review): this also removes apostrophes, so the contraction expansion
# applied further down the pipeline can no longer match anything.
train_2['tweet'] = train_2['tweet'].str.replace(r"[^a-zA-Z]", " ", regex=True)
test_2['tweet'] = test_2['tweet'].str.replace(r"[^a-zA-Z]", " ", regex=True)
# collapse any run of a repeated character to exactly two ("soooo" -> "soo")
_REPEATED_CHAR = re.compile(r'(.)\1{1,}')
train_2['tweet'] = train_2['tweet'].apply(lambda x: _REPEATED_CHAR.sub(r'\1\1', x))
test_2['tweet'] = test_2['tweet'].apply(lambda x: _REPEATED_CHAR.sub(r'\1\1', x))
def remove_accented_chars(tweet):
    """Return *tweet* reduced to ASCII.

    The text is NFKD-decomposed so accents become separate combining marks,
    then everything that cannot be encoded as ASCII is discarded.
    """
    decomposed = unicodedata.normalize('NFKD', tweet)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')
# reduce accented characters to plain ASCII
# NOTE(review): the earlier "[^a-zA-Z]" replacement already turned every
# non-ASCII-letter character into a space, so this step looks like a no-op here.
train_2['tweet']=train_2['tweet'].apply(remove_accented_chars)
test_2['tweet']=test_2['tweet'].apply(remove_accented_chars)
# lower-case all tweets
train_2['tweet']=train_2['tweet'].str.lower()
test_2['tweet']=test_2['tweet'].str.lower()
# Contraction -> expanded form.  Keys are matched case-insensitively by
# `expand_contractions`.
CONTRACTION_MAP = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}


def _compile_contractions(contraction_mapping):
    """Build the case-insensitive alternation that matches any key of the mapping."""
    # Longest keys first: regex alternation takes the first alternative that
    # matches, so "can't" listed before "can't've" would shadow the longer form.
    ordered_keys = sorted(contraction_mapping, key=len, reverse=True)
    alternation = '|'.join(re.escape(key) for key in ordered_keys)
    return re.compile('({})'.format(alternation), flags=re.IGNORECASE | re.DOTALL)


# The default mapping never changes, so its pattern is built only once.
_DEFAULT_CONTRACTIONS_PATTERN = _compile_contractions(CONTRACTION_MAP)


def expand_contractions(tweet, contraction_mapping=CONTRACTION_MAP):
    """Expand the contractions in *tweet* and drop any remaining apostrophes.

    Parameters:
        tweet: the text to process (a ``str``).
        contraction_mapping: dict of contraction -> expansion; keys are
            matched case-insensitively.

    Returns:
        The text with every matched contraction replaced by its expansion.
        When the matched text starts with an upper-case letter the expansion
        is capitalised as well ("Don't" -> "Do not").  All apostrophes left
        over after the expansion are removed.
    """
    if not contraction_mapping:
        # nothing to expand; only the apostrophe clean-up applies
        return tweet.replace("'", "")

    if contraction_mapping is CONTRACTION_MAP:
        contractions_pattern = _DEFAULT_CONTRACTIONS_PATTERN
    else:
        contractions_pattern = _compile_contractions(contraction_mapping)

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (
            contraction_mapping.get(match)
            or contraction_mapping.get(match.lower())
        )
        if not expanded_contraction:
            # matched only through IGNORECASE and no lookup hit: leave as is
            return match
        # Keep the *case* of the first letter rather than the letter itself:
        # splicing the matched character in front turned "ain't" into "as not".
        if match[0].isupper():
            expanded_contraction = (
                expanded_contraction[0].upper() + expanded_contraction[1:]
            )
        return expanded_contraction

    expanded_tweet = contractions_pattern.sub(expand_match, tweet)
    return expanded_tweet.replace("'", "")
# expand contractions in every train and test tweet
# NOTE(review): by this point every non-letter character (apostrophes
# included) has already been replaced by a space, and every CONTRACTION_MAP
# key contains an apostrophe, so nothing can match here -- consider running
# this step before the "[^a-zA-Z]" replacement.
train_2['tweet']=train_2['tweet'].apply(expand_contractions)
test_2['tweet']=test_2['tweet'].apply(expand_contractions)
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')


def remove_stopwords(tweet, is_lower_case=False):
    """Return *tweet* without the tokens found in ``stopword_list``.

    Parameters:
        tweet: the text to filter (a ``str``).
        is_lower_case: pass True when *tweet* is already lower-cased, so
            tokens are compared as they are.  With the default (False) each
            token is lower-cased for the comparison only.  The parameter was
            previously ignored, which made the match case-sensitive.

    Returns:
        The surviving tokens joined by single spaces.
    """
    tokens = [token.strip() for token in tokenizer.tokenize(tweet)]
    # Set membership is O(1); scanning the list for every token is not.
    # Built per call so later edits to `stopword_list` are still honoured.
    stopwords_lookup = set(stopword_list)
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords_lookup]
    else:
        filtered_tokens = [
            token for token in tokens if token.lower() not in stopwords_lookup
        ]
    return ' '.join(filtered_tokens)
# drop stopwords from every train and test tweet
train_2['tweet']=train_2['tweet'].apply(remove_stopwords)
test_2['tweet']=test_2['tweet'].apply(remove_stopwords)
Lemmatization is the process of grouping together the different inflected forms of a word so they can be analysed as a single item. Lemmatization is similar to Stemming but it brings context to the words. So it links words with similar meaning to one word.
Text preprocessing includes both Stemming as well as Lemmatization. Actually, Lemmatization is preferred over Stemming because Lemmatization does morphological analysis of the words.
lemmatizer = WordNetLemmatizer()


def _lemmatize_tweet(tweet):
    """Split *tweet* on whitespace, lemmatize each token and join with spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in tweet.split())


# Tokenizing, lemmatizing and re-joining in one vectorised pass.  The previous
# `for i in range(len(series)): series[i] = ...` loop assigned by index *label*
# one element at a time, which is slow and only correct for a 0..n-1 index.
tokenized_data_train = train_2['tweet'].apply(_lemmatize_tweet)
tokenized_data_test = test_2['tweet'].apply(_lemmatize_tweet)
# saving the clean text
train_2['tweet'] = tokenized_data_train
test_2['tweet'] = tokenized_data_test
# first rows of both tweet tables after the text clean-up
train_2.head()
test_2.head()
SF1
# SF1 distribution: train on the upper axis, test on the lower one
fig, (axis1, axis2) = plt.subplots(nrows=2, ncols=1, figsize=(8, 6))
for _axis, _factors, _split in ((axis1, train_1, "Train"), (axis2, test_1, "Test")):
    _factors['SF1'].plot(kind='hist', bins=30, ax=_axis)
    _axis.set_title(_split, size=12)
SF2
# SF2 distribution: train on the upper axis, test on the lower one
fig, (axis1, axis2) = plt.subplots(nrows=2, ncols=1, figsize=(8, 6))
for _axis, _factors, _split in ((axis1, train_1, "Train"), (axis2, test_1, "Test")):
    _factors['SF2'].plot(kind='hist', bins=30, ax=_axis)
    _axis.set_title(_split, size=12)
SF3
# SF3 distribution: train on the upper axis, test on the lower one
fig, (axis1, axis2) = plt.subplots(nrows=2, ncols=1, figsize=(8, 6))
for _axis, _factors, _split in ((axis1, train_1, "Train"), (axis2, test_1, "Test")):
    _factors['SF3'].plot(kind='hist', bins=30, ax=_axis)
    _axis.set_title(_split, size=15)
SF4
# SF4 distribution: train on the upper axis, test on the lower one
fig, (axis1, axis2) = plt.subplots(nrows=2, ncols=1, figsize=(8, 6))
for _axis, _factors, _split in ((axis1, train_1, "Train"), (axis2, test_1, "Test")):
    _factors['SF4'].plot(kind='hist', bins=30, ax=_axis)
    _axis.set_title(_split, size=15)
SF5
# SF5 distribution: train on the upper axis, test on the lower one
fig, (axis1, axis2) = plt.subplots(nrows=2, ncols=1, figsize=(8, 6))
for _axis, _factors, _split in ((axis1, train_1, "Train"), (axis2, test_1, "Test")):
    _factors['SF5'].plot(kind='hist', bins=30, ax=_axis)
    _axis.set_title(_split, size=15)
SF6
# SF6 distribution: train on the upper axis, test on the lower one
fig, (axis1, axis2) = plt.subplots(nrows=2, ncols=1, figsize=(8, 6))
for _axis, _factors, _split in ((axis1, train_1, "Train"), (axis2, test_1, "Test")):
    _factors['SF6'].plot(kind='hist', bins=30, ax=_axis)
    _axis.set_title(_split, size=15)
SF7
# SF7 distribution: train on the upper axis, test on the lower one
fig, (axis1, axis2) = plt.subplots(nrows=2, ncols=1, figsize=(8, 6))
for _axis, _factors, _split in ((axis1, train_1, "Train"), (axis2, test_1, "Test")):
    _factors['SF7'].plot(kind='hist', bins=30, ax=_axis)
    _axis.set_title(_split, size=15)
alpha
# how many training rows carry each alpha value
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train_1, x='alpha', saturation=1, ax=ax)
ax.set_title("Distribution of Alpha Values", size=15);
SF1-7 with alpha
# pairwise scatter plots of SF1..SF7, coloured by alpha
_factor_columns = ['SF{}'.format(number) for number in range(1, 8)]
sns.pairplot(train_1, vars=_factor_columns, hue='alpha');
sentiment
# how many training tweets carry each sentiment value
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=train_2, x='sentiment', palette='gist_rainbow', saturation=1, ax=ax)
ax.set_title("Distribution of Sentiment Values", size=15);
Tweet length (number of characters in the cleaned tweet) per sentiment.
def length(tweet):
    """Return the number of characters in *tweet*."""
    n_chars = len(tweet)
    return n_chars
# character count of every cleaned training tweet
train_2['tweet_length'] = train_2['tweet'].apply(length)
plt.rcParams['figure.figsize'] = (14, 7)
bins = 40
# one overlaid histogram per sentiment class, each more transparent than the last
for _sentiment, _opacity in zip(range(5), (1.0, 0.8, 0.6, 0.4, 0.2)):
    plt.hist(
        train_2[train_2['sentiment'] == _sentiment]['tweet_length'],
        alpha=_opacity,
        bins=bins,
        label='Sentiment {}'.format(_sentiment),
    )
plt.xlabel('Tweet Length')
# The y axis of a histogram counts observations (tweets); the number of
# characters is what the x axis shows.
plt.ylabel('Number of Tweets')
plt.title('Tweet Length as per Sentiment')
plt.legend(loc='upper right')
plt.grid()
plt.show()
Wordcloud:
It is a visual representation of the words used in a particular piece of text, with the size of each word indicating its relative frequency.
So here we will make different wordclouds, each will be showing most frequent words for specific sentiment
sentiment 0
# word cloud of the training tweets labelled with sentiment 0
recession = train_2.loc[train_2['sentiment'] == 0, 'tweet']
plt.figure(figsize=(18, 32))
wordcloud = WordCloud(
    width=1920, height=1080, max_words=200, min_font_size=2,
    collocations=False, colormap='Reds',
).generate(" ".join(recession))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off');
sentiment 1
# word cloud of the training tweets labelled with sentiment 1
recession = train_2.loc[train_2['sentiment'] == 1, 'tweet']
plt.figure(figsize=(18, 32))
wordcloud = WordCloud(
    width=1920, height=1080, max_words=200, min_font_size=2,
    collocations=False, colormap='Oranges',
).generate(" ".join(recession))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off');
sentiment 2
# word cloud of the training tweets labelled with sentiment 2
recession = train_2.loc[train_2['sentiment'] == 2, 'tweet']
plt.figure(figsize=(18, 32))
wordcloud = WordCloud(
    width=1920, height=1080, max_words=200, min_font_size=2,
    collocations=False, colormap='Wistia_r',
).generate(" ".join(recession))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off');
sentiment 3
# word cloud of the training tweets labelled with sentiment 3
recession = train_2.loc[train_2['sentiment'] == 3, 'tweet']
plt.figure(figsize=(18, 32))
wordcloud = WordCloud(
    width=1920, height=1080, max_words=200, min_font_size=2,
    collocations=False, colormap='Greens',
).generate(" ".join(recession))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off');
sentiment 4
# word cloud of the training tweets labelled with sentiment 4
recession = train_2.loc[train_2['sentiment'] == 4, 'tweet']
plt.figure(figsize=(18, 32))
wordcloud = WordCloud(
    width=1920, height=1080, max_words=200, min_font_size=2,
    collocations=False, colormap='Blues',
).generate(" ".join(recession))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off');
Now we will be doing our machine learning on tweet dataset, we will generate sentiment for test_2 and then finally we will merge factors and tweet datasets and predict alpha signal.
# hold out 30% of the labelled tweets for validation, keep 70% for training
X_train, X_val, y_train, y_val = train_test_split(
    train_2['tweet'],
    train_2['sentiment'],
    test_size=0.30,
    random_state=44,
)
TF-IDF:
In information retrieval, TFIDF, short for term frequency – inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.
# TF-IDF vocabulary (capped at 20000 terms) is learned on the training split only
tv = TfidfVectorizer(max_features=20000)
tv_train = tv.fit_transform(X_train)
# validation and test tweets are projected onto that same vocabulary
tv_val = tv.transform(X_val)
tv_test = tv.transform(test_2.tweet)
for _caption, _matrix in (
    ('tfidf_train:', tv_train),
    ('tfidf_validation:', tv_val),
    ('tfidf_test:', tv_test),
):
    print(_caption, _matrix.shape)
# Multinomial Naive Bayes on the TF-IDF features
mnb = MultinomialNB()
tfidf_mnb = mnb.fit(tv_train, y_train)
# macro-averaged F1 on the training and validation splits
tfidf_train_predict_mnb = tfidf_mnb.predict(tv_train)
tfidf_val_predict_mnb = tfidf_mnb.predict(tv_val)
f1_mnb_tr = f1_score(y_train, tfidf_train_predict_mnb, average='macro')
f1_mnb_val = f1_score(y_val, tfidf_val_predict_mnb, average='macro')
print('Train F1 Score', f1_mnb_tr, sep='\n')
print('Val F1 Score', f1_mnb_val, sep='\n')
# Logistic regression on the TF-IDF features
lg = LogisticRegression()
tfidf_lg = lg.fit(tv_train, y_train)
# macro-averaged F1 on the training and validation splits
tfidf_train_predict_lg = tfidf_lg.predict(tv_train)
tfidf_val_predict_lg = tfidf_lg.predict(tv_val)
f1_lg_tr = f1_score(y_train, tfidf_train_predict_lg, average='macro')
f1_lg_val = f1_score(y_val, tfidf_val_predict_lg, average='macro')
print('Train F1 Score', f1_lg_tr, sep='\n')
print('Val F1 Score', f1_lg_val, sep='\n')
# Gradient boosting on the TF-IDF features
gb = GradientBoostingClassifier()
tfidf_gb = gb.fit(tv_train, y_train)
# macro-averaged F1 on the training and validation splits
tfidf_train_predict_gb = tfidf_gb.predict(tv_train)
tfidf_val_predict_gb = tfidf_gb.predict(tv_val)
f1_gb_tr = f1_score(y_train, tfidf_train_predict_gb, average='macro')
f1_gb_val = f1_score(y_val, tfidf_val_predict_gb, average='macro')
print('Train F1 Score', f1_gb_tr, sep='\n')
print('Val F1 Score', f1_gb_val, sep='\n')
Results
# one (model, train F1, validation F1) row per tweet classifier
_tweet_scores = [
    ('Multinomial Naive Bayes', f1_mnb_tr, f1_mnb_val),
    ('Logistic Regression', f1_lg_tr, f1_lg_val),
    ('Gradient Boosting', f1_gb_tr, f1_gb_val),
]
_names, _train_f1, _val_f1 = zip(*_tweet_scores)
obs_tweet = {
    'Model': list(_names),
    'F1 Macro on Training': list(_train_f1),
    'F1 Macro on Validation': list(_val_f1),
}
results_tweet = pd.DataFrame(obs_tweet)
results_tweet
We will use logistic regression to predict the sentiment for the tweet test set; later we will merge the sentiment from the train and test tweet sets into the factors train and test sets.
# predicting on tweet test set
#y_pred_lg=lg.predict(tv_test)
#test_2['sentiment']=y_pred_lg
#test_2.to_csv('test_2_sentiment.csv',index=False)
# reload the test tweets together with their saved sentiment predictions
test_2_sentiment = pd.read_csv('test_2_sentiment.csv')
test_2_sentiment['date'] = pd.to_datetime(test_2_sentiment['date'])
# lower-case the tickers everywhere so the factor and tweet tables share merge keys
for _frame in (train_1, train_2, test_1, test_2_sentiment):
    _frame['ticker'] = _frame['ticker'].str.lower()
# grouping by 'date' and 'ticker', keeping the highest (max) 'sentiment' of each group
# NOTE(review): the previous comment said "mode", but the code takes .max() --
# confirm which aggregate is intended.
train_groupby=train_2.groupby(by=["date","ticker"])['sentiment'].max()
test_groupby=test_2_sentiment.groupby(by=["date","ticker"])['sentiment'].max()
# merging: left join onto the factor tables, so factor rows without a matching
# (date, ticker) group get NaN in 'sentiment'
train=pd.merge(train_1,train_groupby,how='left',left_on=['date','ticker'],right_on=['date','ticker'])
test=pd.merge(test_1,test_groupby,how='left',left_on=['date','ticker'],right_on=['date','ticker'])
# split the date into calendar month and day-of-month columns, then drop it
_train_dates = pd.DatetimeIndex(train['date'])
_test_dates = pd.DatetimeIndex(test['date'])
train['month'] = _train_dates.month
test['month'] = _test_dates.month
train['day'] = _train_dates.day
test['day'] = _test_dates.day
train = train.drop('date', axis=1)
test = test.drop('date', axis=1)
day and month with SF1
# SF1 against day-of-month (top) and month (bottom)
fig, (axis1, axis2) = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))
for _column, _axis, _title, _extra in (
    ('day', axis1, "Day wise SF1", {'palette': 'rainbow'}),
    ('month', axis2, "Month wise SF1 ", {}),
):
    g = sns.stripplot(x=_column, y='SF1', data=train, ax=_axis, **_extra)
    _axis.set_title(_title, size=15)
day and month with SF2
# SF2 against day-of-month (top) and month (bottom)
fig, (axis1, axis2) = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))
for _column, _axis, _title, _extra in (
    ('day', axis1, "Day wise SF2", {'palette': 'rainbow'}),
    ('month', axis2, "Month wise SF2 ", {}),
):
    g = sns.stripplot(x=_column, y='SF2', data=train, ax=_axis, **_extra)
    _axis.set_title(_title, size=15)
day and month with SF3
# SF3 against day-of-month (top) and month (bottom)
fig, (axis1, axis2) = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))
for _column, _axis, _title, _extra in (
    ('day', axis1, "Day wise SF3", {'palette': 'rainbow'}),
    ('month', axis2, "Month wise SF3 ", {}),
):
    g = sns.stripplot(x=_column, y='SF3', data=train, ax=_axis, **_extra)
    _axis.set_title(_title, size=15)
day and month with SF4
# SF4 against day-of-month (top) and month (bottom)
fig, (axis1, axis2) = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))
for _column, _axis, _title, _extra in (
    ('day', axis1, "Day wise SF4", {'palette': 'rainbow'}),
    ('month', axis2, "Month wise SF4", {}),
):
    g = sns.stripplot(x=_column, y='SF4', data=train, ax=_axis, **_extra)
    _axis.set_title(_title, size=15)
day and month with SF5
# SF5 against day-of-month (top) and month (bottom)
fig, (axis1, axis2) = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))
for _column, _axis, _title, _extra in (
    ('day', axis1, "Day wise SF5", {'palette': 'rainbow'}),
    ('month', axis2, "Month wise SF5", {}),
):
    g = sns.stripplot(x=_column, y='SF5', data=train, ax=_axis, **_extra)
    _axis.set_title(_title, size=15)
day and month with SF6
# SF6 against day-of-month (top) and month (bottom)
fig, (axis1, axis2) = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))
for _column, _axis, _title, _extra in (
    ('day', axis1, "Day wise SF6", {'palette': 'rainbow'}),
    ('month', axis2, "Month wise SF6", {}),
):
    g = sns.stripplot(x=_column, y='SF6', data=train, ax=_axis, **_extra)
    _axis.set_title(_title, size=15)
day and month with SF7
# SF7 against day-of-month (top) and month (bottom)
fig, (axis1, axis2) = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))
for _column, _axis, _title, _extra in (
    ('day', axis1, "Day wise SF7", {'palette': 'rainbow'}),
    ('month', axis2, "Month wise SF7", {}),
):
    g = sns.stripplot(x=_column, y='SF7', data=train, ax=_axis, **_extra)
    _axis.set_title(_title, size=15)
We cannot say whether the values of SF1-7 change significantly on a specific day or in a specific month.
day month with sentiment
# sentiment counts per day-of-month (top) and per month (bottom)
fig, (axis1, axis2) = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))
for _column, _palette, _axis, _title in (
    ('day', 'prism', axis1, "Day wise Sentiment"),
    ('month', 'gist_rainbow', axis2, "Month wise Sentiment"),
):
    g = sns.countplot(x=_column, hue='sentiment', data=train, palette=_palette, ax=_axis)
    _axis.set_title(_title, size=15)
We can see an increase in the count of almost every sentiment value on the 7th, 8th, 9th and 10th day, and likewise in the months of July, August, September and October.
day and month with alpha
# alpha counts per day-of-month (top) and per month (bottom)
fig, (axis1, axis2) = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))
for _column, _palette, _axis, _title in (
    ('day', 'viridis', axis1, "Day wise Alpha"),
    ('month', 'magma', axis2, "Month wise Alpha"),
):
    g = sns.countplot(x=_column, hue='alpha', data=train, palette=_palette, ax=_axis)
    _axis.set_title(_title, size=15)
A similar pattern can be observed for alpha by day and month: the count of almost every alpha value increases on the 7th, 8th, 9th and 10th day, and likewise in the months of July, August, September and October.
sentiment with alpha
# alpha counts within each sentiment value
fig, ax = plt.subplots(figsize=(8, 6))
g = sns.countplot(data=train, x='sentiment', hue='alpha', palette='Wistia', saturation=1, ax=ax)
ax.set_title("Sentiment with Alpha", size=15);
We will impute sentiment column's missing values with median
train.isnull().sum()
# Medians of the numeric training columns only: `train` still holds the text
# column 'ticker', on which a plain `.median()` fails in current pandas.
train_medians = train.median(numeric_only=True)
train.fillna(train_medians, inplace=True)
# Test rows without tweets get the same (training) median; previously they
# stayed NaN and silently turned into all-zero dummy rows.
test['sentiment'] = test['sentiment'].fillna(train_medians['sentiment'])

# Encode both frames against the sentiment levels seen in training, so train
# and test always end up with identical dummy columns in identical order.
sentiment_levels = sorted(train['sentiment'].unique())


def _sentiment_dummies(frame):
    """One-hot encode frame['sentiment'] over `sentiment_levels`, dropping the first level."""
    as_category = pd.Series(
        pd.Categorical(frame['sentiment'], categories=sentiment_levels),
        index=frame.index,
    )
    return pd.get_dummies(as_category, drop_first=True, prefix='sentiment', prefix_sep='_')


dummy_tr_1 = _sentiment_dummies(train)
dummy_te_1 = _sentiment_dummies(test)
train = pd.concat([train, dummy_tr_1], axis=1)
test = pd.concat([test, dummy_te_1], axis=1)
train = train.drop(columns=['sentiment'])
test = test.drop(columns=['sentiment'])
# identifier and calendar columns are not used as model inputs
_non_feature_columns = ['Id', 'month', 'day', 'ticker']
X = train.drop(columns=_non_feature_columns + ['alpha'])
y = train['alpha']
test_for_predictions = test.drop(columns=_non_feature_columns)
# 70/30 train/validation split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.30, random_state=44
)
# Gradient boosting: 300 trees of depth 2
GB = GradientBoostingClassifier(max_depth=2, n_estimators=300)
GB.fit(X_train, y_train)
# macro-averaged F1 on the training and validation splits
y_predict_GB_tr = GB.predict(X_train)
y_predict_GB_val = GB.predict(X_val)
f1_GB_tr = f1_score(y_train, y_predict_GB_tr, average='macro')
f1_GB_val = f1_score(y_val, y_predict_GB_val, average='macro')
print('Train F1 Score', f1_GB_tr, sep='\n')
print('Val F1 Score', f1_GB_val, sep='\n')
# XGBoost with its default settings
XGB = xgb.XGBClassifier()
XGB.fit(X_train, y_train)
# macro-averaged F1 on the training and validation splits
y_predict_XGB_tr = XGB.predict(X_train)
y_predict_XGB_val = XGB.predict(X_val)
f1_XGB_tr = f1_score(y_train, y_predict_XGB_tr, average='macro')
f1_XGB_val = f1_score(y_val, y_predict_XGB_val, average='macro')
print('Train F1 Score', f1_XGB_tr, sep='\n')
print('Val F1 Score', f1_XGB_val, sep='\n')
# LightGBM: 3000 trees of depth 2
LGBM = LGBMClassifier(n_estimators=3000, max_depth=2)
LGBM.fit(X_train, y_train)
# macro-averaged F1 on the training and validation splits
y_predict_LGBM_tr = LGBM.predict(X_train)
y_predict_LGBM_val = LGBM.predict(X_val)
f1_LGBM_tr = f1_score(y_train, y_predict_LGBM_tr, average='macro')
f1_LGBM_val = f1_score(y_val, y_predict_LGBM_val, average='macro')
print('Train F1 Score', f1_LGBM_tr, sep='\n')
print('Val F1 Score', f1_LGBM_val, sep='\n')
# CatBoost: 3000 trees of depth 3, training log silenced
CB = CatBoostClassifier(n_estimators=3000, max_depth=3, verbose=False)
CB.fit(X_train, y_train)
# macro-averaged F1 on the training and validation splits
y_predict_CB_tr = CB.predict(X_train)
y_predict_CB_val = CB.predict(X_val)
f1_CB_tr = f1_score(y_train, y_predict_CB_tr, average='macro')
f1_CB_val = f1_score(y_val, y_predict_CB_val, average='macro')
print('Train F1 Score', f1_CB_tr, sep='\n')
print('Val F1 Score', f1_CB_val, sep='\n')
Results
# one (model, train F1, validation F1) row per alpha classifier
_alpha_scores = [
    ('Gradient Boosting', f1_GB_tr, f1_GB_val),
    ('XGBoost', f1_XGB_tr, f1_XGB_val),
    ('Light GBM', f1_LGBM_tr, f1_LGBM_val),
    ('Cat Boost', f1_CB_tr, f1_CB_val),
]
_names, _train_f1, _val_f1 = zip(*_alpha_scores)
obs = {
    'Model': list(_names),
    'F1 Macro on Training': list(_train_f1),
    'F1 Macro on Validation': list(_val_f1),
}
results = pd.DataFrame(obs)
results
We will use Cat Boost algorithm to predict on our test set.